library(tidyverse)
library(sf)
library(ggrepel)
library(plotly)
library(ggtext)
library(ggpubr)
marathon <- read_csv("../data/marathon_results_2017.csv")
flCounties <- st_read("../data/Florida_Counties/Florida_Counties.shp", quiet = T)
usCities <- read_csv("../data/uscities.csv")
# check data
head(marathon)
## # A tibble: 6 × 22
## Bib Name Age `M/F` City State Country `5K` `10K` `15K` `20K` Half
## <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <time> <chr> <chr> <chr> <time>
## 1 11 Kirui… 24 M Keri… <NA> KEN 15'25" 0:30… 0:45… 1:01… 01:04:35
## 2 17 Rupp,… 30 M Port… OR USA 15'24" 0:30… 0:45… 1:01… 01:04:35
## 3 23 Osako… 25 M Mach… <NA> JPN 15'25" 0:30… 0:45… 1:01… 01:04:36
## 4 21 Biwot… 32 M Mamm… CA USA 15'25" 0:30… 0:45… 1:01… 01:04:45
## 5 9 Chebe… 31 M Mara… <NA> KEN 15'25" 0:30… 0:45… 1:01… 01:04:35
## 6 15 Abdir… 40 M Phoe… AZ USA 15'25" 0:30… 0:45… 1:01… 01:04:35
## # … with 10 more variables: `25K` <time>, `30K` <time>, `35K` <time>,
## # `40K` <time>, Pace <time>, `Proj Time` <chr>, `Official Time` <time>,
## # Overall <dbl>, Gender <dbl>, Division <dbl>
I want to focus on Florida
# get just people from Florida
flMar <- filter(marathon, State == "FL")
# check data
head(flMar)
## # A tibble: 6 × 22
## Bib Name Age `M/F` City State Country `5K` `10K` `15K` `20K` Half
## <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <time> <chr> <chr> <chr> <time>
## 1 148 Vanos… 45 M Orla… FL USA 17'42" 0:35… 0:53… 1:11… 01:15:08
## 2 358 Byers… 36 M Miami FL USA 18'14" 0:36… 0:55… 1:13… 01:17:25
## 3 470 Shold… 36 M Jack… FL USA 18'20" 0:36… 0:55… 1:13… 01:17:42
## 4 333 Sloan… 29 M Jack… FL USA 18'07" 0:36… 0:54… 1:12… 01:15:59
## 5 2123 Hartj… 24 M Nept… FL USA 20'21" 0:40… 1:00… 1:19… 01:24:14
## 6 176 Huber… 38 M Miam… FL USA 18'08" 0:36… 0:54… 1:13… 01:17:27
## # … with 10 more variables: `25K` <time>, `30K` <time>, `35K` <time>,
## # `40K` <time>, Pace <time>, `Proj Time` <chr>, `Official Time` <time>,
## # Overall <dbl>, Gender <dbl>, Division <dbl>
# change variable names to make them easier for using
names(flMar) <- make.names(names(flMar))
# remove any rows that could have been duplicated
flMar <- distinct(flMar)
# select just columns I might use
flMar <- select(flMar, c("Age", "M.F", "City", "State", "Official.Time", "Overall", "Gender"))
# check for NAs
colSums(is.na(flMar))
## Age M.F City State Official.Time
## 0 0 0 0 0
## Overall Gender
## 0 0
# check data
flMar
## # A tibble: 609 × 7
## Age M.F City State Official.Time Overall Gender
## <dbl> <chr> <chr> <chr> <time> <dbl> <dbl>
## 1 45 M Orlando FL 02:34:40 94 84
## 2 36 M Miami FL 02:35:44 104 92
## 3 36 M Jacksonville FL 02:36:40 122 106
## 4 29 M Jacksonville FL 02:40:03 188 167
## 5 24 M Neptune Beach FL 02:46:56 416 386
## 6 38 M Miami Beach FL 02:47:06 424 393
## 7 37 M Tallahassee FL 02:47:31 448 417
## 8 27 M Miami FL 02:47:35 453 421
## 9 30 M Jacksonville FL 02:49:21 536 499
## 10 34 M Saint Petersburg FL 02:49:31 549 512
## # … with 599 more rows
This dataset will only be used to find the county a city is in
# narrow dataset down to just Florida and needed columns
flCities <- usCities %>%
filter(state_id == "FL") %>%
select(c("city", "county_name"))
# add county name to marathon data check for failed matches
left_join(flMar, flCities, c("City" = "city")) %>%
filter(is.na(county_name))
## # A tibble: 67 × 8
## Age M.F City State Official.Time Overall Gender county_name
## <dbl> <chr> <chr> <chr> <time> <dbl> <dbl> <chr>
## 1 34 M Saint Petersburg FL 02:49:31 549 512 <NA>
## 2 43 M St Petersburg FL 02:52:16 711 669 <NA>
## 3 42 F Lowell FL 03:03:01 1772 120 <NA>
## 4 39 M Land O Lakes FL 03:09:06 2626 2392 <NA>
## 5 40 M Saint Augustine FL 03:15:53 3833 3377 <NA>
## 6 47 M St Johns FL 03:16:03 3861 3399 <NA>
## 7 37 M Lake Worth FL 03:16:30 3950 3470 <NA>
## 8 36 M Port Saint Lucie FL 03:19:24 4533 3913 <NA>
## 9 41 M Saint Johns FL 03:20:00 4654 4005 <NA>
## 10 41 F Saint Petersburg FL 03:25:21 5812 988 <NA>
## # … with 57 more rows
# Converting and fixing city names to match names in flCities
flMar$City <- str_replace(flMar$City, "Saint", "St.")
flMar$City <- str_replace(flMar$City, "St ", "St. ")
flMar$City <- str_replace(flMar$City, " O ", " O' ")
flMar$City <- str_replace(flMar$City, " Locka", "-locka")
flMar$City <- str_replace(flMar$City, "Ft.", "Fort ")
flMar$City <- str_replace(flMar$City, "Fort ", "Fort ") # one fort had a extra space
flMar$City <- str_replace(flMar$City, "Deland", "DeLand")
flMar$City <- str_replace(flMar$City, "Augustime", "Augustine")
# check for failed matches again
left_join(flMar, flCities, c("City" = "city")) %>%
filter(is.na(county_name)) %>%
group_by(City) %>%
summarize(n = n())
## # A tibble: 17 × 2
## City n
## <chr> <int>
## 1 Amelia Island 1
## 2 Clearwater Beach 1
## 3 Deleon Springs 1
## 4 Lake Worth 3
## 5 Lithia 2
## 6 Lowell 1
## 7 Palm Beach Shore 1
## 8 Parrish 1
## 9 Pensacola Beach 2
## 10 Ponte Vedra 1
## 11 Ponte Vedra Beach 4
## 12 Santa Rosa Beach 4
## 13 St. Johns 8
## 14 Tegucigalpa Honduras 1
## 15 Viera 1
## 16 Watersound 1
## 17 West Palm 1
These cities were not found in the flCities data. Due to time constraints and the small amount of these cities with missing counties, their county will not be manually found and added.
# add counties to marathon data for rows with city matches
flMarCounties <- inner_join(flMar, flCities, c("City" = "city"))
# change county column name to match flCounties country column
flMarCounties <- rename(flMarCounties, "COUNTYNAME" = "county_name")
flMarCounties
## # A tibble: 575 × 8
## Age M.F City State Official.Time Overall Gender COUNTYNAME
## <dbl> <chr> <chr> <chr> <time> <dbl> <dbl> <chr>
## 1 45 M Orlando FL 02:34:40 94 84 Orange
## 2 36 M Miami FL 02:35:44 104 92 Miami-Dade
## 3 36 M Jacksonville FL 02:36:40 122 106 Duval
## 4 29 M Jacksonville FL 02:40:03 188 167 Duval
## 5 24 M Neptune Beach FL 02:46:56 416 386 Duval
## 6 38 M Miami Beach FL 02:47:06 424 393 Miami-Dade
## 7 37 M Tallahassee FL 02:47:31 448 417 Leon
## 8 27 M Miami FL 02:47:35 453 421 Miami-Dade
## 9 30 M Jacksonville FL 02:49:21 536 499 Duval
## 10 34 M St. Petersburg FL 02:49:31 549 512 Pinellas
## # … with 565 more rows
# make flCounties county capitalization match flMarCounties
flCounties$COUNTYNAME <- str_to_title(flCounties$COUNTYNAME)
# match this county name
flCounties$COUNTYNAME <- replace(flCounties$COUNTYNAME, flCounties$COUNTYNAME == "Dade", "Miami-Dade")
# add the counties from the shapefile that had no runners and put a 0 for their number of runners.
flMarCounties <- flMarCounties %>%
group_by(COUNTYNAME) %>%
summarise(n = n()) %>%
right_join(flCounties, "COUNTYNAME") %>%
mutate(n = ifelse(is.na(n), 0, n)) %>%
select(c("COUNTYNAME", "n"))
arrange(flMarCounties, desc(n))
## # A tibble: 67 × 2
## COUNTYNAME n
## <chr> <dbl>
## 1 Miami-Dade 82
## 2 Hillsborough 48
## 3 Palm Beach 47
## 4 Orange 46
## 5 Broward 41
## 6 Duval 38
## 7 Pinellas 32
## 8 Pasco 22
## 9 Seminole 19
## 10 Brevard 18
## # … with 57 more rows
ggplotly(ggplot(flMar) +
geom_boxplot(aes(M.F, Overall, fill = M.F, color = M.F)) +
scale_fill_manual(values = c(F = "lightcoral", M = "lightskyblue"), guide = "none") +
scale_color_manual(values = c(F = "Red", M = "Blue"), guide = "none") +
scale_y_reverse() + # also makes the numbers negative
labs(title = "<span style = 'color:Blue;'>Male</span> and <span style='color:Red;'>Female</span> Overall Rank for 2017 Boston Marathon Finishers from Florida") +
theme_minimal() +
theme(plot.title = element_markdown(), axis.title = element_blank(), axis.text.x = element_blank(), legend.position = "none"))
The best and worst finishers are similar for both genders, but on average males performed better.
# get coordinates for county labeling
flCountiesPoints <- st_centroid(flCounties)
flCountiesPoints <- cbind(flCountiesPoints, st_coordinates(st_centroid(flCountiesPoints$geometry)))
# filter just counties with over 20 finishers
# and split into left and right side for finer control on map
leftPoints <- filter(flCountiesPoints, COUNTYNAME %in% c("Hillsborough", "Pinellas", "Pasco"))
rightPoints <- filter(flCountiesPoints, COUNTYNAME %in% c("Miami-Dade","Palm Beach", "Orange", "Broward", "Duval"))
# spatial visualization
merge(flCounties, flMarCounties, by = "COUNTYNAME") %>%
ggplot() +
geom_sf(aes(fill = n),alpha=0.8, col="white") +
scale_fill_viridis_c(name = "") +
geom_text_repel(data = leftPoints, aes(x = X, y = Y, label = COUNTYNAME), nudge_x = -1.5, min.segment.length = 0) +
geom_text_repel(data = rightPoints, aes(x = X, y = Y, label = COUNTYNAME), nudge_x = 1.5, min.segment.length = 0) +
annotate("text", -Inf, Inf, label = "82 finishers", hjust = -6.55, vjust = 36.5, size = 3) +
labs(title = "2017 Boston Marathon Finishers by County in Florida", subtitle = "Counties with at least 20 finishers are labeled") +
theme_void() +
theme(plot.title = element_markdown(), plot.subtitle = element_markdown(), legend.position = "left")
It is interesting to see most finishers are from southeast Florida with a significant amount in central Florida, and Miami-Dade having a lot more than the rest.
ggplot(flMar, aes(Age, Official.Time, color = M.F)) +
geom_point(alpha = 0.4) +
geom_smooth(method = "lm") +
scale_color_manual(values = c(F = "Red", M = "Blue"), guide_legend(title = "Gender")) +
stat_regline_equation(aes(label = after_stat(rr.label)), show.legend = F) +
labs(title = "Age vs Official Time of 2017 Boston Marathon Finishers from Florida") +
xlab("Age (Year)") +
ylab("Official Time (HH:MM:SS)") +
theme_minimal() +
theme(plot.title = element_markdown())
## `geom_smooth()` using formula = 'y ~ x'
The correlation between age and the official time achieved is not strong. But the linear model does suggest that older people take longer to finish and more so for males.
My process started started with choosing a dataset. I decided to go with the 2017 Boston Marathon Finishers because it has a good amount of data, multiple variables that I could use, and it has city and state data that could be combined with a shapefile to make a spatial visualization. The dataset was decently formatted and I did not have to do much cleaning. Main changes made were renaming the columns for easier use, creating a subset of just the finishers from Florida with the columns I was going to use, and checking for missing values, which my subset had none of. I was able to make my originally planned charts which were an interactive boxplot, a spatial visualization of Florida counties, and a scatterplot with a linear model.
The interactive boxplot was the first graph I made. It was the gender
of the finisher vs their overall rank. For this graph, I tried to keep
it simple, so the only labels are the title and the rank values on the y
axis. The title has colored words in it to match the gender for each
boxplot. I also reversed the y axis, so that 0 is at the top. I think
that this makes more sense as a rank closer to 0 is a better performance
and I think better numbers should be at the top for a graph. One problem
with this though is that hovering over the boxplot after making it
interactive through plotly, shows a negative sign in front
of the statistics for it. I could not find a solution for this, and I
also had a problem where hjust was not working for this
boxplot. However, the graph still looked fine and is able to clearly
convey the information I choose from the dataset. The best and worst
finishers are similar for both genders, but on average males performed
better.
My next graph, the spatial visualization was difficult to make work the way I wanted. It shows the number of finishers per county in Florida, and I labeled the counties with over 20 finishers. My main issue is that I have the city of the finisher from marathon dataset but not their county. So, I found a freely available dataset that has us cities and their counties. I filtered that data down to just Florida cities and did a left join on it with my marathon data to check for missing matches. A large number of failed matches were due to inconsistent spelling and typos in the city name for the marathon data. After fixing those, I did an inner join to get just the matches, as there was only a small amount of cities left with no counties found for it in the US cities dataset. Then I had to do column renaming and changing a city name so that I could merge it with shapefile for Florida counties. I grouped my data by counties and counting the number of occurrences of county and merged that into shapefile for the Florida counties. Finally, with that combined data, I was able to make the spatial visualization. Using it, I saw that most finishers are from southeast Florida with a significant amount in central Florida, and Miami-Dade having a lot more than the rest.
My final graph was a scatterplot of the age vs official time for the
finishers. The points were colored by gender and I added a linear model
for both as well as their R-squared value. This was easy to make through
the cleaned data I made before this. My only difficulty was adding the
R-squared value, and I used stat_regline_equation from the
ggpubr package to make it easier. Also, to make it easier
to see the points at cluttered areas, I lowered the alpha of the points.
Contrary to my previous graphs, I kept the axis titles to help explain
their value types as I thought they were less clear then the others and
putting the value types in the title would make my already long title
too long. This graph showed that the correlation between age and the
official time achieved is not strong. But the linear model does suggest
that older people take longer to finish and more so for males.
Overall, for my graphs, I tried to keep them consistent. I removed any information I believed was unnecessary and repetitive. I kept the colors as consistent as I could. For both my boxplot and scatterplot that had gender in them, male was blue, and females were red. The only graph with different colors was the spatial visualization as it has a continuous variable, and I used the Viridis color palette for it, as it colors for it help show similarities and differences well without overemphasis.